import pandas as pd
# Load the raw travel-notes workbook into the working DataFrame.
source_path = '旅游网站精华游记数据.xlsx'
df = pd.read_excel(source_path)
def dealPlace(place):
    """Strip ASCII letters from a place string.

    Args:
        place: A cell value. Strings are filtered; any other value
            (for example a missing-value float) is passed through untouched.

    Returns:
        For a string, a new string holding every character of ``place``
        that is not in ``a``-``z`` or ``A``-``Z``, in the original order.
        For a non-string, ``place`` itself.
    """
    if not isinstance(place, str):
        return place
    # Both range bounds are inclusive so that 'a', 'z', 'A' and 'Z'
    # themselves are removed as well.
    return ''.join(
        c for c in place
        if not ('a' <= c <= 'z' or 'A' <= c <= 'Z')
    )
def dealView(view):
    """Expand a view count written with the '万' (ten-thousand) suffix.

    Args:
        view: A cell value. Only strings containing '万' are rewritten.

    Returns:
        For a string containing '万' whose remainder parses as a number,
        the decimal string of that number multiplied by 10000
        (``'1.2万'`` -> ``'12000'``, ``'3万'`` -> ``'30000'``).
        Every other value, including strings that cannot be parsed,
        is returned unchanged.
    """
    if not isinstance(view, str) or '万' not in view:
        return view
    try:
        # Scale numerically rather than by deleting the dot and padding
        # zeros: padding is only right for exactly one decimal digit.
        scaled = round(float(view.replace('万', '')) * 10000)
    except (ValueError, OverflowError):
        # Leave unparseable text alone so the caller's own conversion
        # reports the offending value.
        return view
    return str(scaled)
# Keep only the text before the first whitespace of the departure field.
df['出发日期'] = df['出发日期'].str.split(expand=True)[0]
# Drop the first and the last character, then convert the rest to int.
df['天数'] = df['天数'].str.slice(1, -1).astype('int')
# Drop the first two characters and the last one; the result stays a string.
df['人均消费（元）'] = df['人均消费（元）'].str.slice(2, -1)
df['途经地点'] = df['途经地点'].apply(dealPlace)
# Remove the '途径:' prefix text and turn the '>' separators into '、'.
# Both replacements are literal (regex=False).
df['途经地点'] = (
    df['途经地点']
    .str.replace('途径:', '', regex=False)
    .str.replace('>', '、', regex=False)
)
df['阅览数'] = df['阅览数'].apply(dealView).astype('int')
# Show the cleaned columns; '出发日期' is the column processed above.
print(df[['出发日期', '天数', '人均消费（元）', '阅览数', '途经地点']])
##


# Flag every row whose '标题' value already appeared in an earlier row
# (the first occurrence stays False).
df1 = df.duplicated(subset=['标题'])
print('除包含重复值的第一行外，其他包含重复值标记为True的行:\n', df1[df1])
print('删除重复值前数据的行数:', len(df))
# Keep the first occurrence of each title and renumber the index from 0.
df.drop_duplicates(subset=['标题'], inplace=True, ignore_index=True)
print('删除重复值后数据的行数:', len(df))

#

# Count the missing values in each row.
df2 = df.isnull().sum(axis=1)
print('缺失值个数大于2的行:\n', df2[df2 > 2])
print('删除缺失值前数据的行数:', len(df))
# Keep only rows with at least 5 non-missing values. `how` is not passed
# because pandas does not allow combining it with `thresh`.
df.dropna(thresh=5, inplace=True)
print('删除缺失值后数据的行数:', len(df))

#

# Parse the departure dates and store their month number in a new column.
departure_dates = pd.to_datetime(df['出发日期'])
df['月份'] = departure_dates.dt.month
# Preview the first 20 rows of the date and its derived month.
print(df[['出发日期', '月份']].head(20))
#
# Write the processed table to a new workbook without the index column.
output_path = '旅游网站精华游记数据_预处理.xlsx'
df.to_excel(output_path, index=False)
#
